#!/usr/bin/python
# Author: Jimmy Saw
# Date: 01/04/2012
# This script consolidates gene coordinates predicted by different gene finders
# and keeps the ones that agree with each other. Prints the ones that don't agree
# so that manual curation can be done.
# Usage: consolidate_orfs.py predict1.txt predict2.txt
# Needs two files:
# 1. Glimmer predict file in tab-delimited form
# 2. Prodigal out file in GFF format
# NOTE: TO BE CONTINUED. NOT FINISHED YET. STOPPED BECAUSE NOT NECESSARY FOR CURRENT GENOME ANNOTATION.

import sys
import re
from Bio import SeqIO

def parse_glimmer(lines):
    """Parse a tab-delimited Glimmer predict file into ORF tuples.

    The first line is treated as a header and skipped; blank lines are
    ignored.  Each remaining line is split on tabs and read as
    ``id, coord1, coord2, frame, ...``.

    Returns a list of ``(id, low, high, strand)`` tuples.  When the frame
    column starts with ``+`` the coordinates are kept in file order;
    otherwise the ORF is treated as reverse-strand and the two coordinates
    are swapped (this assumes Glimmer lists the start codon first, so that
    start > end on the reverse strand -- confirm against your input).
    """
    orfs = []
    for lineno, raw in enumerate(lines):
        if lineno == 0 or not raw.strip():
            continue
        fields = raw.rstrip("\r\n").split("\t")
        first, second = int(fields[1]), int(fields[2])
        if fields[3].startswith("+"):
            orfs.append((fields[0], first, second, "+"))
        else:
            orfs.append((fields[0], second, first, "-"))
    return orfs


def parse_prodigal(lines):
    """Parse Prodigal GFF output into ORF tuples.

    Comment lines (starting with ``#``) and blank lines are skipped wherever
    they occur, instead of assuming exactly three header lines.  Records are
    numbered from 0 in file order and the zero-padded number is appended to
    the sequence id (column 1) to build the record id.

    Returns a list of ``(recid, start, end, strand)`` tuples taken from GFF
    columns 4, 5 and 7.
    """
    orfs = []
    for raw in lines:
        if not raw.strip() or raw.lstrip().startswith("#"):
            continue
        fields = raw.rstrip("\r\n").split("\t")
        recid = fields[0] + str(len(orfs)).zfill(4)
        orfs.append((recid, int(fields[3]), int(fields[4]), fields[6]))
    return orfs


def stop_key(orf):
    """Return ``(strand, stop_coordinate)`` for an ``(id, low, high, strand)`` ORF.

    Both parsers store coordinates as low/high, so the stop codon sits at the
    high coordinate on the ``+`` strand and at the low coordinate otherwise.
    """
    low, high, strand = orf[1], orf[2], orf[3]
    return (strand, high if strand == "+" else low)


def compare_orfs(glimmerorfs, prodigalorfs):
    """Classify every Glimmer ORF against the Prodigal ORF sharing its stop.

    Two ORFs are paired only when they are on the same strand and have the
    same stop coordinate.  If several Prodigal ORFs share a stop, the first
    one in the list is used.

    Returns a dict with four counters that always sum to
    ``len(glimmerorfs)``:

    * ``bothmatch``     -- same stop and same start
    * ``shorter``       -- same stop, Prodigal ORF is shorter
    * ``longer``        -- same stop, Prodigal ORF is longer
    * ``bothdontmatch`` -- no Prodigal ORF with that strand and stop
    """
    # Index Prodigal ORFs once so each Glimmer ORF is a single dict lookup
    # rather than a scan over the whole Prodigal list.
    by_stop = {}
    for orf in prodigalorfs:
        by_stop.setdefault(stop_key(orf), orf)

    counts = {"bothmatch": 0, "shorter": 0, "longer": 0, "bothdontmatch": 0}
    for orf in glimmerorfs:
        partner = by_stop.get(stop_key(orf))
        if partner is None:
            # Only ORFs without any partner count as unmatched.
            counts["bothdontmatch"] += 1
            continue
        # With a shared stop, comparing spans is equivalent to comparing
        # start codons and works the same way on either strand.
        glimmer_span = orf[2] - orf[1]
        prodigal_span = partner[2] - partner[1]
        if prodigal_span == glimmer_span:
            counts["bothmatch"] += 1
        elif prodigal_span < glimmer_span:
            counts["shorter"] += 1
        else:
            counts["longer"] += 1
    return counts


def main(argv=None):
    """Read the two prediction files named on the command line and print counts.

    ``argv`` defaults to ``sys.argv``; ``argv[1]`` is the Glimmer file and
    ``argv[2]`` the Prodigal GFF file.  Exits with a usage message when
    fewer than two file arguments are given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        sys.exit("Usage: consolidate_orfs.py predict1.txt predict2.txt")

    # Context managers close the files as soon as parsing is done.
    with open(argv[1]) as glimmerfile:
        glimmerorfs = parse_glimmer(glimmerfile)
    with open(argv[2]) as prodigalfile:
        prodigalorfs = parse_prodigal(prodigalfile)

    counts = compare_orfs(glimmerorfs, prodigalorfs)
    total = sum(counts.values())

    # Single-argument print calls give the same output on Python 2 and 3.
    print("Glimmer %d" % len(glimmerorfs))
    print("Prodigal %d" % len(prodigalorfs))
    print("Matches %d" % counts["bothmatch"])
    print("Total Glimmer %d" % total)


if __name__ == "__main__":
    main()
